In this notebook I download and unzip the Ford Go Bike data.
#rm(list = ls())
library(pacman)
p_load(tidyverse, tictoc, ggmap, skimr, lubridate, forcats, biganalytics, doParallel)
#tictoc: tic() before a chunk and toc() after it report how long the chunk of code takes to run
Downloading the data directly from https://s3.amazonaws.com/fordgobike-data
# Download the raw Ford GoBike trip data into ./data.
# Make sure the target directory exists first: download.file() writes to
# `destfile` but does not create missing directories.
dir.create("./data", showWarnings = FALSE, recursive = TRUE)

# 2017 is published as a single file that includes all months.
URL <- "https://s3.amazonaws.com/fordgobike-data/2017-fordgobike-tripdata.csv"
download.file(URL, destfile = "./data/2017-fordgobike-tripdata.csv", method = "curl")

# In 2018 the data was separated by month, so loop over the monthly archives
# to download everything from January to July.
for (i in 1:7) {
  URL <- paste0("https://s3.amazonaws.com/fordgobike-data/20180", i, "-fordgobike-tripdata.csv.zip")
  download.file(URL, destfile = paste0("./data/20180", i, "-fordgobike-tripdata.csv.zip"), method = "curl")
}
Unzip downloaded files.
# Extract each monthly 2018 archive (January through July) into ./data.
# Note: in the recorded run every one of these unzip() calls emitted the
# warning "error 1 in extracting from zip file".
invisible(lapply(1:7, function(month_no) {
  unzip(paste0("./data/20180", month_no, "-fordgobike-tripdata.csv.zip"), exdir = "./data")
}))
Clean up data directory.
# Remove the monthly 2018 zip archives (January through July) from ./data.
# file.exists() and file.remove() are vectorised, so one call replaces the
# seven copy-pasted exists/remove pairs. Only archives that are still present
# are passed to file.remove(), so nothing is attempted for missing files.
zip_files <- paste0("./data/20180", 1:7, "-fordgobike-tripdata.csv.zip")
invisible(file.remove(zip_files[file.exists(zip_files)]))
Read the .csv files
# Load the 2017 file and the seven monthly 2018 files, one tibble per file.
fordgobike2017   <- read_csv("./data/2017-fordgobike-tripdata.csv")
fordgobike201801 <- read_csv("./data/201801-fordgobike-tripdata.csv")
fordgobike201802 <- read_csv("./data/201802-fordgobike-tripdata.csv")
fordgobike201803 <- read_csv("./data/201803-fordgobike-tripdata.csv")
fordgobike201804 <- read_csv("./data/201804-fordgobike-tripdata.csv")
fordgobike201805 <- read_csv("./data/201805-fordgobike-tripdata.csv")
fordgobike201806 <- read_csv("./data/201806-fordgobike-tripdata.csv")
fordgobike201807 <- read_csv("./data/201807-fordgobike-tripdata.csv")
Check the head() of the loaded data.frames
# Show the first three rows of every loaded table.
fordgobike2017 %>% head(3)
fordgobike201801 %>% head(3)
fordgobike201802 %>% head(3)
fordgobike201803 %>% head(3)
fordgobike201804 %>% head(3)
fordgobike201805 %>% head(3)
fordgobike201806 %>% head(3)
fordgobike201807 %>% head(3)
Check the tail() of the loaded data.frames.
# Show the last three rows of every loaded table.
fordgobike2017 %>% tail(3)
fordgobike201801 %>% tail(3)
fordgobike201802 %>% tail(3)
fordgobike201803 %>% tail(3)
fordgobike201804 %>% tail(3)
fordgobike201805 %>% tail(3)
fordgobike201806 %>% tail(3)
fordgobike201807 %>% tail(3)
Check the dimension (number of rows and columns) of the data
# Report (rows, columns) for every loaded table; the 2017 file has one column
# fewer than the monthly 2018 files.
fordgobike2017 %>% dim()
## [1] 519700 15
fordgobike201801 %>% dim()
## [1] 94802 16
fordgobike201802 %>% dim()
## [1] 106718 16
fordgobike201803 %>% dim()
## [1] 111382 16
fordgobike201804 %>% dim()
## [1] 131169 16
fordgobike201805 %>% dim()
## [1] 179125 16
fordgobike201806 %>% dim()
## [1] 195968 16
fordgobike201807 %>% dim()
## [1] 199222 16
Change the data types of the station ID variables in the June and July 2018 data to be uniform with the other months, and then merge all 2018 months
# Coerce both station id columns of a monthly trip table to integer.
# Values that cannot be parsed become NA (see the coercion warnings below).
station_ids_to_integer <- function(trips) {
  mutate(
    trips,
    start_station_id = as.integer(start_station_id),
    end_station_id = as.integer(end_station_id)
  )
}

fordgobike201806 <- station_ids_to_integer(fordgobike201806)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
fordgobike201807 <- station_ids_to_integer(fordgobike201807)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion

# Stack January through July into a single 2018 table and inspect its columns.
fordgobike2018 <- list(
  fordgobike201801, fordgobike201802, fordgobike201803,
  fordgobike201804, fordgobike201805, fordgobike201806,
  fordgobike201807
) %>%
  bind_rows()
fordgobike2018 %>% glimpse()
## Observations: 1,018,386
## Variables: 16
## $ duration_sec <dbl> 75284, 85422, 71576, 61076, 39966, 6477, 453,…
## $ start_time <dttm> 2018-01-31 22:52:35, 2018-01-31 16:13:34, 20…
## $ end_time <dttm> 2018-02-01 19:47:19, 2018-02-01 15:57:17, 20…
## $ start_station_id <dbl> 120, 15, 304, 75, 74, 236, 110, 81, 134, 305,…
## $ start_station_name <chr> "Mission Dolores Park", "San Francisco Ferry …
## $ start_station_latitude <dbl> 37.76142, 37.79539, 37.34876, 37.77379, 37.77…
## $ start_station_longitude <dbl> -122.4264, -122.3942, -121.8948, -122.4212, -…
## $ end_station_id <dbl> 285, 15, 296, 47, 19, 160, 134, 93, 4, 317, 4…
## $ end_station_name <chr> "Webster St at O'Farrell St", "San Francisco …
## $ end_station_latitude <dbl> 37.78352, 37.79539, 37.32600, 37.78095, 37.78…
## $ end_station_longitude <dbl> -122.4312, -122.3942, -121.8771, -122.3997, -…
## $ bike_id <dbl> 2765, 2815, 3039, 321, 617, 1306, 3571, 1403,…
## $ user_type <chr> "Subscriber", "Customer", "Customer", "Custom…
## $ member_birth_year <dbl> 1986, NA, 1996, NA, 1991, NA, 1988, 1980, 198…
## $ member_gender <chr> "Male", NA, "Male", NA, "Male", NA, "Male", "…
## $ bike_share_for_all_trip <chr> "No", "No", "No", "No", "No", "No", "No", "No…
Merge 2017 and 2018 data
# Check the dimensions of both yearly tables before stacking them.
fordgobike2017 %>% dim()
## [1] 519700 15
fordgobike2018 %>% dim()
## [1] 1018386 16
# Stack 2017 on top of 2018 into one table.
fordgobike <- fordgobike2017 %>% bind_rows(fordgobike2018)
fordgobike %>% head(3)
# Output the combined data as a csv file.
write.csv(x = fordgobike, file = "./data/fordgobike.csv")
fordgobike %>% dim()
## [1] 1538086 16
Create new variables for “age”, “year”, “month”, and “day”
# Derive the rider's age plus the calendar year, month and day of each trip.
# Age is measured in the year of the ride (taken from start_time) rather than
# with year(now()), so the values do not change with the date the notebook is
# run.
fordgobike <- fordgobike %>%
  mutate(
    age = year(start_time) - member_birth_year,
    year = year(start_time),
    month = month(start_time),
    day = day(start_time)
  )
head(fordgobike, 3)
dim(fordgobike)
## [1] 1538086 20
Create a new variable “weekday”
# Add the day of the week of each trip as a labelled factor.
# wday() returns integers 1-7; with week_start = 1, Monday is 1 and Sunday is 7.
# The integers are mapped onto the labels via `labels =`. Passing the character
# labels as `levels =` (as before) matched nothing and turned every value
# into NA.
fordgobike <- fordgobike %>%
  mutate(week_day = wday(start_time, week_start = 1))
levels <- c("M", "T", "W", "TH", "F", "SAT", "SUN")
fordgobike$week_day <- factor(fordgobike$week_day, levels = 1:7, labels = levels)
head(fordgobike, 3)
dim(fordgobike)
## [1] 1538086 21
Distribution of riders by “age”, filtering to riders aged 80 or under, 100 or under, and over 100.
# Number of trips for each rider age.
count(group_by(fordgobike, age))
# Column-wise summary statistics of the (age-grouped) table.
summary(group_by(fordgobike, age))
## duration_sec start_time end_time
## Min. : 61.0 Min. :2017-06-28 09:47:36 Min. :2017-06-28 09:52:55
## 1st Qu.: 361.0 1st Qu.:2017-11-14 10:08:31 1st Qu.:2017-11-14 10:21:12
## Median : 569.0 Median :2018-03-15 07:10:23 Median :2018-03-15 07:24:04
## Mean : 957.4 Mean :2018-02-22 12:28:46 Mean :2018-02-22 12:44:43
## 3rd Qu.: 897.0 3rd Qu.:2018-06-02 17:56:46 3rd Qu.:2018-06-02 18:19:06
## Max. :86369.0 Max. :2018-07-31 23:57:19 Max. :2018-08-01 11:00:22
##
## start_station_id start_station_name start_station_latitude
## Min. : 3.0 Length:1538086 Min. :37.31
## 1st Qu.: 28.0 Class :character 1st Qu.:37.77
## Median : 79.0 Mode :character Median :37.78
## Mean :107.7 Mean :37.77
## 3rd Qu.:173.0 3rd Qu.:37.80
## Max. :357.0 Max. :45.51
## NA's :5245
## start_station_longitude end_station_id end_station_name
## Min. :-122.44 Min. : 3.0 Length:1538086
## 1st Qu.:-122.41 1st Qu.: 27.0 Class :character
## Median :-122.40 Median : 77.0 Mode :character
## Mean :-122.36 Mean :105.6
## 3rd Qu.:-122.39 3rd Qu.:171.0
## Max. : -73.57 Max. :357.0
## NA's :5245
## end_station_latitude end_station_longitude bike_id user_type
## Min. :37.28 Min. :-122.44 Min. : 10 Length:1538086
## 1st Qu.:37.77 1st Qu.:-122.41 1st Qu.:1045 Class :character
## Median :37.78 Median :-122.40 Median :2072 Mode :character
## Mean :37.77 Mean :-122.35 Mean :2021
## 3rd Qu.:37.80 3rd Qu.:-122.39 3rd Qu.:2952
## Max. :45.51 Max. : -73.57 Max. :4307
##
## member_birth_year member_gender bike_share_for_all_trip age
## Min. :1881 Length:1538086 Length:1538086 Min. : 20.0
## 1st Qu.:1976 Class :character Class :character 1st Qu.: 31.0
## Median :1984 Mode :character Mode :character Median : 36.0
## Mean :1982 Mean : 38.2
## 3rd Qu.:1989 3rd Qu.: 44.0
## Max. :2000 Max. :139.0
## NA's :137667 NA's :137667
## year month day week_day
## Min. :2017 Min. : 1.0 Min. : 1.00 M : 0
## 1st Qu.:2017 1st Qu.: 4.0 1st Qu.: 8.00 T : 0
## Median :2018 Median : 6.0 Median :16.00 W : 0
## Mean :2018 Mean : 6.3 Mean :15.98 TH : 0
## 3rd Qu.:2018 3rd Qu.: 9.0 3rd Qu.:24.00 F : 0
## Max. :2018 Max. :12.0 Max. :31.00 (Other): 0
## NA's :1538086
# Per-column overview of the combined table: missing counts, completeness and
# type-specific statistics, reported separately for each column type.
skim(fordgobike)
| Name | fordgobike |
| Number of rows | 1538086 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| factor | 1 |
| numeric | 13 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| start_station_name | 0 | 1.00 | 4 | 63 | 0 | 316 | 0 |
| end_station_name | 0 | 1.00 | 4 | 63 | 0 | 316 | 0 |
| user_type | 0 | 1.00 | 8 | 10 | 0 | 2 | 0 |
| member_gender | 137326 | 0.91 | 4 | 6 | 0 | 3 | 0 |
| bike_share_for_all_trip | 519700 | 0.66 | 2 | 3 | 0 | 2 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| week_day | 1538086 | 0 | FALSE | 0 | M: 0, T: 0, W: 0, TH: 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| duration_sec | 0 | 1.00 | 957.38 | 2891.83 | 61.00 | 361.00 | 569.00 | 897.00 | 86369.00 | ▇▁▁▁▁ |
| start_station_id | 5245 | 1.00 | 107.70 | 92.97 | 3.00 | 28.00 | 79.00 | 173.00 | 357.00 | ▇▅▂▂▁ |
| start_station_latitude | 0 | 1.00 | 37.77 | 0.10 | 37.31 | 37.77 | 37.78 | 37.80 | 45.51 | ▇▁▁▁▁ |
| start_station_longitude | 0 | 1.00 | -122.36 | 0.15 | -122.44 | -122.41 | -122.40 | -122.39 | -73.57 | ▇▁▁▁▁ |
| end_station_id | 5245 | 1.00 | 105.63 | 92.60 | 3.00 | 27.00 | 77.00 | 171.00 | 357.00 | ▇▃▂▂▁ |
| end_station_latitude | 0 | 1.00 | 37.77 | 0.10 | 37.28 | 37.77 | 37.78 | 37.80 | 45.51 | ▇▁▁▁▁ |
| end_station_longitude | 0 | 1.00 | -122.35 | 0.15 | -122.44 | -122.41 | -122.40 | -122.39 | -73.57 | ▇▁▁▁▁ |
| bike_id | 0 | 1.00 | 2020.60 | 1152.29 | 10.00 | 1045.00 | 2072.00 | 2952.00 | 4307.00 | ▇▇▇▇▅ |
| member_birth_year | 137667 | 0.91 | 1981.80 | 10.56 | 1881.00 | 1976.00 | 1984.00 | 1989.00 | 2000.00 | ▁▁▁▂▇ |
| age | 137667 | 0.91 | 38.20 | 10.56 | 20.00 | 31.00 | 36.00 | 44.00 | 139.00 | ▇▂▁▁▁ |
| year | 0 | 1.00 | 2017.66 | 0.47 | 2017.00 | 2017.00 | 2018.00 | 2018.00 | 2018.00 | ▅▁▁▁▇ |
| month | 0 | 1.00 | 6.30 | 3.06 | 1.00 | 4.00 | 6.00 | 9.00 | 12.00 | ▆▆▇▃▅ |
| day | 0 | 1.00 | 15.98 | 8.78 | 1.00 | 8.00 | 16.00 | 24.00 | 31.00 | ▇▇▇▇▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| start_time | 0 | 1 | 2017-06-28 09:47:36 | 2018-07-31 23:57:19 | 2018-03-15 07:10:23 | 1538011 |
| end_time | 0 | 1 | 2017-06-28 09:52:55 | 2018-08-01 11:00:22 | 2018-03-15 07:24:04 | 1538010 |
# Histogram of rider age across all trips.
ggplot(fordgobike, aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Riders aged 80 or under.
ggplot(filter(fordgobike, age <= 80), aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Riders aged 100 or under.
ggplot(filter(fordgobike, age <= 100), aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Riders recorded as older than 100.
ggplot(filter(fordgobike, age > 100), aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Group by “gender”, “age” (below 81) and plot their histograms
# Number of trips for each gender/age combination.
fordgobike %>%
  group_by(member_gender, age) %>%
  count()
# Age histogram split by gender. `fill` is used to distinguish genders:
# `class` is not a ggplot2 aesthetic, so the previous mapping had no effect on
# the plot.
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 137667 rows containing non-finite values (stat_bin).
# Same plot on the density scale.
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram(aes(y = ..density..))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 137667 rows containing non-finite values (stat_bin).
# Overall age histogram restricted to riders aged 80 or under.
fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot histograms of people below 81 years old and facet by gender with relative colors
# Age histogram of riders aged 80 or under, one facet row per gender, with the
# bar outlines coloured by gender.
ggplot(
  data = filter(fordgobike, age <= 80),
  mapping = aes(x = age, color = member_gender)
) +
  geom_histogram(position = "identity") +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same plot on the density scale.
ggplot(
  data = filter(fordgobike, age <= 80),
  mapping = aes(x = age, color = member_gender)
) +
  geom_histogram(aes(y = ..density..), position = "identity") +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plotting the “Year”, “month”, and day of week.
# Trip counts per year.
ggplot(fordgobike, aes(x = year)) +
  geom_bar()
# Trip counts per month, one facet row per year.
ggplot(fordgobike, aes(x = month)) +
  geom_bar() +
  facet_grid(year ~ .)
# Trip counts per day of the month, one facet row per year.
ggplot(fordgobike, aes(x = day)) +
  geom_bar() +
  facet_grid(year ~ .)
Removing geocode outliers, subset longitude and latitude, and plot the riders’ location in the Bay Area
# Drop geocode outliers: keep only start stations south of 38 degrees latitude
# and west of -120 degrees longitude. The longitude bound must be negative:
# every longitude in the data is negative, so the previous `< 120` test was
# always true and filtered nothing.
fordgobike2018 <- fordgobike2018 %>%
  filter(start_station_latitude < 38 & start_station_longitude < -120)
# Keep just the start-station coordinates (longitude first, then latitude).
fordgobike_subset <- fordgobike2018 %>%
  select(start_station_longitude, start_station_latitude)
# Scatter plot of the remaining start-station locations.
fordgobike_subset %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()
Store the data as a matrix, create a k-means cluster for each location (Oakland, San Jose, San Francisco), and map them
# Time the clustering step (the matching toc() follows the cluster plot).
tic()
registerDoParallel(cores = 8)
head(fordgobike2018)
# bigkmeans() is given the coordinates as a plain matrix.
fordgobike_subset2 <- as.matrix(fordgobike_subset)
# Seed the random number generator. The previous `set.seed <- 123454321` only
# created a variable called `set.seed` and never seeded anything.
# NOTE(review): with a parallel backend registered, the random starts may still
# not be fully reproducible from this seed alone - confirm.
set.seed(123454321)
# Three clusters, best of eight random starts.
cl <- bigkmeans(fordgobike_subset2, 3, nstart = 8)
cl$centers
## [,1] [,2]
## [1,] -121.8953 37.34168
## [2,] -122.2660 37.83117
## [3,] -122.4072 37.77809
# Attach each trip's k-means cluster label to the 2018 table.
fordgobike2018 <- mutate(fordgobike2018, clust = cl$cluster)
# Start-station coordinates coloured by cluster label.
ggplot(
  fordgobike_subset,
  aes(x = start_station_longitude, y = start_station_latitude, color = cl$cluster)
) +
  geom_point()
# Stop the timer started before the clustering step.
toc()
## 60.854 sec elapsed
Plotting the stations
# https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering
# Print the fitted cluster centres: one row per cluster, longitude in column 1
# and latitude in column 2 (the column order of the matrix that was clustered).
cl$centers
## [,1] [,2]
## [1,] -121.8953 37.34168
## [2,] -122.2660 37.83117
## [3,] -122.4072 37.77809
# Return the row index of the cluster centre nearest to `x` (Euclidean
# distance).
#   x        numeric coordinate vector, in the same column order as `centers`
#   centers  matrix with one cluster centre per row. Defaults to the centres of
#            the fitted k-means object `cl`, looked up when the function is
#            called, so existing one-argument calls behave as before.
closest.cluster <- function(x, centers = cl$centers) {
  cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
  # which.min() already returns the first minimum; [1] keeps a single index.
  which.min(cluster.dist)[1]
}
# Find the cluster label of each city from a reference (longitude, latitude).
oak <- closest.cluster(c(-122.2711, 37.8044))
sj <- closest.cluster(c(-121.8953, 37.34168))
sf <- closest.cluster(c(-122.4072, 37.77809))

# Oakland stations
oakland <- filter(fordgobike2018, clust == oak)
ggplot(oakland, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point() +
  ggtitle("Oakland Ford Go Bike stations")

# San Jose stations
san_jose <- filter(fordgobike2018, clust == sj)
ggplot(san_jose, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point() +
  ggtitle("San Jose Ford Go Bike stations")

# San Francisco stations
san_francisco <- filter(fordgobike2018, clust == sf)
ggplot(san_francisco, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point() +
  ggtitle("San Francisco Ford Go Bike stations")
Register with Google Maps and locate bike stations
#register_google(key = "XXXXXXXXXXXXXXXXX-XXXXXXXXXX", write = TRUE)
# Each map is centred on the fitted cluster centre of the city and overlays the
# start stations of the trips assigned to that cluster.

# Oakland
ggmap(get_map(
  location = c(lon = cl$centers[oak, 1], lat = cl$centers[oak, 2]),
  zoom = 12, maptype = "roadmap"
)) +
  geom_point(
    data = oakland,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1, shape = 19
  ) +
  ggtitle("Oakland Ford Go Bike stations")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.831171,-122.26603&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-mUUagmuvZ5bE

# San Jose
ggmap(get_map(
  location = c(lon = cl$centers[sj, 1], lat = cl$centers[sj, 2]),
  zoom = 12, maptype = "roadmap"
)) +
  geom_point(
    data = san_jose,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1, shape = 19
  ) +
  ggtitle("San Jose Ford Go Bike stations")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.341677,-121.895287&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-mUUagmuvZ5bE
## Warning: Removed 8 rows containing missing values (geom_point).

# San Francisco
ggmap(get_map(
  location = c(lon = cl$centers[sf, 1], lat = cl$centers[sf, 2]),
  zoom = 12, maptype = "roadmap"
)) +
  geom_point(
    data = san_francisco,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1, shape = 19
  ) +
  ggtitle("San Francisco Ford Go Bike stations")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.778085,-122.407228&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-mUUagmuvZ5bE
Bike stations in the Bay Area
# Time the download and drawing of the Bay Area overview map.
tic()
#I chose hayward to better capture San Jose
bayarea <- get_map(location = "hayward")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=hayward&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-mUUagmuvZ5bE
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=hayward&key=xxx-mUUagmuvZ5bE
# Overlay every 2018 start station, coloured by cluster.
# - alpha is a fixed setting, so it sits outside aes(); inside aes() it was
#   treated as a data mapping rather than a 10% opacity.
# - the cluster label is categorical, so it is wrapped in factor() to get one
#   discrete colour per cluster instead of a continuous gradient.
ggmap(bayarea) +
  geom_point(
    data = fordgobike2018,
    aes(x = start_station_longitude, y = start_station_latitude, color = factor(clust)),
    alpha = 0.1, size = 1, shape = 19
  ) +
  labs(color = "clust") +
  ggtitle("Bay Area Ford Go Bike stations")
## Warning: Removed 262 rows containing missing values (geom_point).
toc()
## 69.024 sec elapsed
Gender of users in the Bay Area, and then by city (k-means cluster labels are arbitrary; in the run shown above, cluster 1 = San Jose, 2 = Oakland, 3 = San Francisco)
# Duration Distribution by Gender
# Bars stack the individual trip durations, so each bar's height is the total
# ride time for that gender, in millions of seconds.
fordgobike2018 %>%
  ggplot(aes(x = member_gender, y = duration_sec / 1000000)) +
  geom_bar(stat = "Identity") +
  ylab("Duration (in Million Sec)") +
  xlab("Gender") +
  ggtitle("Duration Distribution By Gender")

# Duration Distribution by Gender for each city
# Cluster labels are translated to city names with the `oak` and `sj` labels
# computed earlier via closest.cluster(), instead of recomputing them here.
# Any other cluster is San Francisco. The earlier ggtitle("Bay Area") was dead
# code (a later ggtitle() replaced it) and has been removed.
fordgobike2018 %>%
  mutate(
    clust = ifelse(
      clust %in% oak,
      "Oakland",
      ifelse(clust %in% sj, "San Jose", "San Francisco")
    )
  ) %>%
  ggplot(aes(x = member_gender, y = duration_sec / 1000000)) +
  geom_bar(stat = "Identity") +
  ylab("Duration (in Million Sec)") +
  xlab("Gender") +
  ggtitle("Duration Distribution by Gender for Each City") +
  facet_grid(clust ~ .)
Plot the density histograms of ride durations in the Bay Area by gender
# Density histograms of ride durations.
# The x axis is limited to 0-10000 seconds; rows outside that range are dropped
# by the scale (see the warnings below). geom_density() inherits
# y = ..density.. from the plot-level aes(), so it is not repeated per layer.
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5581 rows containing non-finite values (stat_bin).
## Warning: Removed 5581 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
# Density histograms of logged ride durations.
# The mapping is written as x = log(duration_sec); the previous
# log(x = duration_sec) put the `x =` inside the call to log().
fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density histograms of ride durations by gender.
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density() +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5581 rows containing non-finite values (stat_bin).
## Warning: Removed 5581 rows containing non-finite values (stat_density).
## Warning: Removed 8 rows containing missing values (geom_bar).
# Density histograms of logged ride durations by gender.
fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density() +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary by gender for each city
# Mean and standard deviation of ride duration by gender, per city.
# Cities are selected through the `oak`, `sj` and `sf` labels computed earlier
# with closest.cluster(), not through hard-coded cluster numbers. K-means
# numbering is arbitrary, and in the centres printed above cluster 1 is
# San Jose and cluster 2 is Oakland, so the literal 1/2/3 filters were
# mislabelled.

# Oakland
fordgobike2018 %>%
  filter(clust == oak) %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
# San Jose
fordgobike2018 %>%
  filter(clust == sj) %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
# San Francisco
fordgobike2018 %>%
  filter(clust == sf) %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))